# ###################################################################################
# Bilateral IPE Data Master Merge
# Miriam Barnum
# Fall 2022/Spring 2023 Update: Gaea Morales 
#####################################################################################

## NOTE TO GM
# make sure COW alliances is merged 
# check all the datasets pre 1960 (1815/1816 start) - missing country names
  # years before 1960 are missing country names so append_ids is dropping them
  # WAR
  # ATOP
  # COW
  # MID

#library(foreign)
#library(readstata13)
library(tidyverse)
library(parallel) 
library(peacesciencer)

options(mc.cores = parallel::detectCores())

# Organizational note
  # Kept data that was not updated sourced from Master Bilateral Data v2 folder
  # Created copies of updated raw data, prep scripts, and RDATA files (e.g., from Summer 2021) into the separate Master Bilateral Data v3 folders

# preppeddata path is key filepath for most prepped datasets 
dofiles = "/Volumes/GoogleDrive-106129646335010644812/.shortcut-targets-by-id/0B2iwvlsRgy8tQXlVNll5WWhLcGs/Master IPE Data/Master Bilateral Data v2/prep scripts/"
prepscripts = "/Volumes/GoogleDrive-106129646335010644812/.shortcut-targets-by-id/0B2iwvlsRgy8tQXlVNll5WWhLcGs/Master IPE Data/Master Bilateral v3/prep scripts/"
preppeddata = "/Users/gaeamorales/Library/CloudStorage/GoogleDrive-gaeapatr@usc.edu/.shortcut-targets-by-id/0B2iwvlsRgy8tQXlVNll5WWhLcGs/Master IPE Data/Master Bilateral v3/prepped data/"
rawdatav2 = "/Volumes/GoogleDrive-106129646335010644812/.shortcut-targets-by-id/0B2iwvlsRgy8tQXlVNll5WWhLcGs/Master IPE Data/Master Bilateral Data v2/raw data/"
rawdata = "/Volumes/GoogleDrive-106129646335010644812/.shortcut-targets-by-id/0B2iwvlsRgy8tQXlVNll5WWhLcGs/Master IPE Data/Master Bilateral v3/raw data/"

ids_path <- '/Volumes/GoogleDrive-106129646335010644812/.shortcut-targets-by-id/1JpiIrqOeS1K9UlbREPhw_iziB-QxPH91/append_ids/'

# --- Load all datasets

# Annotated with "RDATA" object names 

# Load datasets 

#load(paste(preppeddatav2,"PREPPED_ALL_MZ_102517.RDATA",sep="")) # "all"
  # now using peacesciencer for COW Alliances 

load(paste(preppeddata,"baci_prepped_MBAK_07142021.RData", sep="")) # trade.final 
baci <- trade.final

# bit <- readRDS(paste(preppeddata, "PREPPED_BIT_MB_GM_08202022.rds", sep="")) #bit 
  #   # not dyadic, early .dta file set up as country-year

cdis <- readRDS(paste(preppeddata, "PREPPED_CDIS_MB_YC_GM_092122.RDS", sep="")) # "cdis" 

load(paste(preppeddata,"PREPPED_CEPII_MB_121018.RDATA", sep="")) # "dist" - CEPII data time invariant
cepii <- dist 

cpis <- readRDS(paste(preppeddata, "PREPPED_CPIS_BT_YC_06302022.RDS", sep="")) # "cpis" 
length(unique(cpis$repgwno))

bea <- readRDS(paste(preppeddata, "PREPPED_FDI_BEA_BT_YC_06292022.RDS", sep="")) # "bea"

dots <- readRDS(paste(preppeddata, "PREPPED_DOTS_RH_GM_04042022.RDS", sep="")) # "dots"

load(paste(preppeddata,"PREPPED_FDI_OECD_AST_LIAB_BT_070921.RDATA",sep="")) # "fdi_oecd"

get <- readRDS(paste(preppeddata, "PREPPED_GET_RH_01292022.RDS",sep="")) # "get"

igo <- readRDS(paste(preppeddata, "PREPPED_IGO_GM_01152023.RDS", sep="")) # "igo"

load(paste(preppeddata, "PREPPED_MID_BT_071221.RDATA", sep="")) # "mids"

# load(paste(preppeddata, "prepped_MIG_RH_01182022.RDATA", sep="")) # "mig5"
# mig <- mig5

load(paste(preppeddata, "PREPPED_MIG_OECD_RH_02082021.RDATA", sep="")) # "df"
mig_oecd <- df 

load(paste(preppeddata,"PREPPED_MIG_UN_RH_02262021.RDATA",sep="")) # "un2"
mig_un <- un2

pbl <- readRDS(paste(preppeddata,"PREPPED_PBL_DD_MEB_GM_04042022.rds", sep="")) 

pta <- readRDS(paste(preppeddata, "PREPPED_PTA_RH_01232022.RDS", sep="")) 

load(paste(preppeddata,"PREPPED_REF_RH_02182021.RDATA",sep="")) # "unhcr2"
ref <- unhcr2

load(paste(preppeddata,"PREPPED_TH_MB_030518.RDATA",sep="")) # "th"

load(paste(preppeddata,"PREPPED_WAR_DC_071321.RDATA",sep="")) # "war"

wb <- readRDS(paste(preppeddata, "PREPPED_MIG_WB_AK_11182021.RDS", sep=""))

wbmr <- readRDS(paste(preppeddata, "PREPPED_WBMR_CS_07132020.rds",sep=""))

load(paste(preppeddata,"WDI_PWT_W_CGR_BT_063021.RDATA", sep="")) # "wdi_pwt"
wdi_pwt %>% 
  select(repgwno, pargwno, year, "growth_rep_WDI_PWT" = growth_WDI_PW_rep,
         "growth_par_WDI_PWT" = growth_WDI_PW_par)

unct <- readRDS(paste(preppeddata, "PREPPED_UNCT_GM_08202022.rds", sep = ""))

load(paste(preppeddata, "Prepped_ATOP_RH_02022021.RDATA", sep ="")) # atop
atop <- ATOP3

cowt <- readRDS(paste(preppeddata, "PREPPED_COWT_GM_07282022.RDS", sep = ""))

load(paste(preppeddata, "PREPPED_UNV_RH_12012021.RDATA", sep = ""))
unv <- unv3

cgr <- readRDS(paste(preppeddata, "cor_growth_rates_pwt_MB_211206.rds", sep = "")) %>% 
  select(repgwno, pargwno, year, "cgr_10_pwt_mean_CGR" = cgr_10_pwt_mean, 
         "cgr_10_pwt_sd_CGR" = cgr_10_pwt_sd)

load(paste(preppeddata, "PREPPED_COMMONPEG_VFC_11252022.RDATA", sep=""))
irk <- irk_dyad %>% 
  rename("common_peg_IRK" = common_peg)

ck <- readRDS(paste(preppeddata, "prepped_ESIM_CK_YC_11182022.rds", sep =""))

## read in and merge COW alliances using peacesciencer

source(paste(ids_path,"append_ids.R",sep=""))
names(all)
all <- cow_ddy %>% add_cow_alliance() %>% 
  mutate(repcountry = countrycode::countrycode(ccode1, origin = "cown", destination = "country.name"),
         parcountry = countrycode::countrycode(ccode2, origin = "cown", destination = "country.name")) %>% 
  # manually add missing cowns
  mutate(repcountry = ifelse(ccode1 == "260", "German Federal Republic", repcountry),
         repcountry = ifelse(ccode1 == "730", "Korea", repcountry),
         parcountry = ifelse(ccode2 == "260", "German Federal Republic", parcountry),
         parcountry = ifelse(ccode2 == "730", "Korea", parcountry)) %>% 
  append_ids(dyad = T, breaks = F) %>% 
  append_suffix(dyad = T, "ALL") %>% 
  select(-contains("raw"))

names(all)

dataList <- list(27)
dataList[[1]] = all
dataList[[2]] = atop
dataList[[3]] = baci
dataList[[4]] = bea
dataList[[5]] = dots
dataList[[6]] = cdis
dataList[[7]] = cgr
dataList[[8]] = ck
dataList[[9]] = cowt
dataList[[10]] = cpis
dataList[[11]] = fdi_oecd
dataList[[12]] = get
dataList[[13]] = irk
dataList[[14]] = igo
dataList[[15]] = mids
dataList[[16]] = mig_oecd
dataList[[17]] = mig_un
dataList[[18]] = pbl
dataList[[19]] = pta
dataList[[20]] = ref
dataList[[21]] = th
dataList[[22]] = unct 
dataList[[23]] = unv
dataList[[24]] = war
dataList[[25]] = wb
dataList[[26]] = wbmr
dataList[[27]] = wdi_pwt

# clean environment (to reduce memory pressure)
rm(all,atop,baci,bea,cdis,cgr,ck,cpis,fdi_oecd,irk,irk_dyad,get,mids,mig_oecd,mig_un,pbl,pta,ref,
   th,war,wb,wbmr,wdi_pwt)
  
# --- Merge
ipe <- dataList[[1]]

for (i in 2:length(dataList)) {
  newdata = dataList[[i]]
  
  print(i)
  ipe = merge(ipe, newdata, 
              by = c("repgwno", "pargwno", "year"), all = TRUE)
  
  # Drop duplicated columns
  ipe$repcountry.x = NULL
  ipe$repccode.x = NULL
  ipe$repifs.x = NULL
  ipe$repifscode.x = NULL
  ipe$repgwabbrev.x = NULL
  
  ipe$repcountry.y = NULL
  ipe$repccode.y = NULL
  ipe$repifs.y = NULL
  ipe$repifscode.y = NULL
  ipe$repgwabbrev.y = NULL
  
  ipe$parcountry.x = NULL
  ipe$parccode.x = NULL
  ipe$parifs.x = NULL
  ipe$parifscode.x = NULL
  ipe$pargwabbrev.x = NULL
  
  ipe$parcountry.y = NULL
  ipe$parccode.y = NULL
  ipe$parifs.y = NULL
  ipe$parifscode.y = NULL
  ipe$pargwabbrev.y = NULL
  
}


# save a backup
# save(ipe, file = paste(preppeddata, "bilat_IPE_backup_tmp_GM_04042022.RDATA", sep=""))
saveRDS(ipe, file = paste(preppeddata, "bilat_IPE_backup_tmp_GM_01102023.RDS", sep=""))

#ipe <- readRDS("/Volumes/GoogleDrive-106129646335010644812/.shortcut-targets-by-id/0B2iwvlsRgy8tQXlVNll5WWhLcGs/Master IPE Data/Master Bilateral v3/prepped data/bilat_IPE_backup_tmp_GM_01102023.RDS")
# load(paste(preppeddata, "bilat_IPE_backup_tmp_GM_07212021.RDATA",sep=""))

# Merge CEPII distance data with IPE (time invariant, so don't merge on year)
ipe <- merge(ipe, cepii,
             by = c("repgwno", "pargwno"), 
             all = TRUE)  

ipe$year.y = NULL
names(ipe)[names(ipe) == "year.x"] <- "year"

ipe$repcountry = NULL
ipe$repccode = NULL
ipe$repifs = NULL
ipe$repifscode = NULL
ipe$repgwabbrev = NULL

ipe$parcountry = NULL
ipe$parccode = NULL
ipe$parifs = NULL
ipe$parifscode = NULL
ipe$pargwabbrev = NULL

ipe <- ipe %>% 
  arrange(repgwno, pargwno, year) %>% 
  filter(!is.na(year))

#### Standardize Country Names by GWNO  ####
#load("raw-data v2/Standard_gwno_country_ifs.RDATA")
load(paste(preppeddata, "Standard_gwno_country_ifs.RDATA", sep = ""))
gwnoCountry$gwno <- as.numeric(gwnoCountry$gwno)

# Change all the country names to standard ones
ipe_bilat <- merge(ipe, gwnoCountry, by.y = "gwno", by.x = "pargwno", all.x = TRUE)
names(ipe_bilat)[names(ipe_bilat) == "countryName"] <- "parcountry"
names(ipe_bilat)[names(ipe_bilat) == "ifs"] <- "parifs"
names(ipe_bilat)[names(ipe_bilat) == "ifscode"] <- "parifscode"
names(ipe_bilat)[names(ipe_bilat) == "ccode"] <- "parccode"
names(ipe_bilat)[names(ipe_bilat) == "gwabbrev"] <- "pargwabbrev"

ipe_bilat <- merge(ipe_bilat, gwnoCountry, by.y = "gwno", by.x = "repgwno", all.x = TRUE)
names(ipe_bilat)[names(ipe_bilat) == "countryName"] <- "repcountry"
names(ipe_bilat)[names(ipe_bilat) == "ifs"] <- "repifs"
names(ipe_bilat)[names(ipe_bilat) == "ifscode"] <- "repifscode"
names(ipe_bilat)[names(ipe_bilat) == "ccode"] <- "repccode"
names(ipe_bilat)[names(ipe_bilat) == "gwabbrev"] <- "repgwabbrev"

# visually inspect to make sure it worked
countries <- ipe_bilat %>% 
  select(repgwno, year, repcountry, repccode, repifs, repifscode, repgwabbrev, pargwno, parcountry, parccode, parifs, parifscode, pargwabbrev)

sum(is.na(countries$repgwno))
sum(is.na(countries$year))
sum(is.na(countries$repccode))
sum(is.na(countries$repcountry))
sum(is.na(countries$pargwno))
sum(is.na(countries$parccode))
sum(is.na(countries$parcountry))
unique(countries$repcountry[is.na(countries$repccode)])
unique(countries$parcountry[is.na(countries$parccode)])

# add_name("Vietnam (Annam/Cochin China/Tonkin)", "vietnam")

# use append_ids instead 
ids_path <- "/Volumes/GoogleDrive-106129646335010644812/My Drive/append_ids/"
source(paste(ids_path, "append_ids.R", sep = ""))

options(mc.cores = parallel::detectCores())

ipe_bilat2 <- ipe_bilat %>% 
  append_ids(dyad = T, breaks = F)

# Save merged backup

#saveRDS(ipe_bilat2, file = paste(preppeddata, "bilat2_IPE_backup_tmp_GM_01102023.RDS", sep=""))
#ipe_bilat2 <- readRDS(paste(preppeddata, "bilat2_IPE_backup_tmp_GM_01102023.RDS", sep=""))

# other vars to remove 
ipe_bilat3 <- ipe_bilat2 %>% 
  dplyr::select(-c(numa_GET, numb_GET, acra_GET, acrb_GET)) %>% 
  dplyr::select(-contains("_na_")) %>% # removes na columns of unct data that's creating duplicates
  dplyr::select(-contains("_UNCT")) %>%  
  dplyr::select(-contains("_COWT")) %>% 
  full_join(cowt, by = c("repgwno", "pargwno", "year")) %>% 
  full_join(unct, by = c("repgwno", "pargwno", "year")) 

names(ipe_bilat3)

# recoding NAs in MIG_UN data which is driving duplicates 
  # NAs are recoded as 0, and this is noted in codebook
ipe_bilat3$migstock_MIG_UN <- ipe_bilat3$migstock_MIG_UN %>% replace(is.na(.), 0)

# Remove Raw Country Names, ccodes, etc
columnsToRemove = colnames(ipe_bilat3)
columnsToRemove = columnsToRemove[grep("*_raw_*", columnsToRemove)]
ipe_bilat3 = ipe_bilat3[,!(colnames(ipe_bilat3) %in% columnsToRemove)]

# Remove .y columns and unnecessary vars
ipe_bilat3 <- ipe_bilat3 %>% 
  dplyr::select(-contains(".y")) %>% 
  dplyr::select(-contains(".x")) %>% 
  select(-c(positionIN, positionOUT, contig_CEPII, stateA_ATOP, stateB_ATOP, version_ATOP,
            i_BACI, j_BACI))
           # ideal1, ideal2, absidealdiff, s2un, s3un, agree3un, iso_d_TH, iso_o_TH))

names(ipe_bilat3)

# Remove duplicates
#ipe_bilat3 <- ipe_bilat3[!(duplicated(ipe_bilat3)),]

# Remove duplicates now raw country names have been removed
ipe_bilat3 <- ipe_bilat3 %>% 
  group_by(repcountry, repgwno, parcountry, pargwno, year) %>% 
  distinct()

# Another go
ipe_bilat3 <- ipe_bilat3[!duplicated(ipe_bilat3[ , c("repgwno", "pargwno", "year")]), ] 

# Check Duplicates 
n_occur <- data.frame(table(ipe_bilat3$repgwno, ipe_bilat3$pargwno, ipe_bilat3$year))
n_occur[n_occur$Freq > 1,]

# Change the order of all columns so that country, year, gwno show up at the front
allColnames <- colnames(ipe_bilat3)
length(allColnames) #170
mainVector <- c("repcountry", "parcountry", "year","repgwno", "pargwno", "repccode", "repifscode", "repifs", "repgwabbrev", 
                "parccode", "parifscode", "parifs", "pargwabbrev")
leftCol <- allColnames[!(allColnames %in% mainVector)]
newCol <- c(mainVector, leftCol)
ipe_bilat3 <- ipe_bilat3[,newCol]
names(ipe_bilat3)

# use append_ids instead -- SKIPPING this step, somehow drops variables
# ids_path <- "/Volumes/GoogleDrive-106129646335010644812/My Drive/append_ids/"
# source(paste(ids_path, "append_ids.R", sep = ""))
# 
# ipe_bilat3 <- ipe_bilat3 %>% 
#   append_ids(dyad = T, breaks = F)

length(unique(ipe_bilat3$repcountry)) #193 countries 
min(ipe_bilat3$year) #1815
max(ipe_bilat3$year) #2021 

# Convert from character to numeric
str(ipe_bilat3)
ipe_bilat3$repccode <- as.numeric(ipe_bilat3$repccode)
ipe_bilat3$repifscode <- as.numeric(ipe_bilat3$repifscode)
ipe_bilat3$parccode <- as.numeric(ipe_bilat3$parccode)
ipe_bilat3$parifscode <- as.numeric(ipe_bilat3$parifscode)
ipe_bilat3$distw_CEPII <- as.numeric(ipe_bilat3$distw_CEPII)
ipe_bilat3$distwces_CEPII <- as.numeric(ipe_bilat3$distwces_CEPII)
ipe_bilat3$refugees_REF <- as.numeric(ipe_bilat3$refugees_REF)
ipe_bilat3$asylum_REF <- as.numeric(ipe_bilat3$asylum_REF)
ipe_bilat3$returned_REF <- as.numeric(ipe_bilat3$returned_REF)

# Order by gwno and year
#ipe = ipe[order(ipe$gwno, ipe_bilat$year),]
ipe_bilat3 <- ipe_bilat3 %>% arrange(repgwno, pargwno, year)

str(ipe_bilat3)
names(ipe_bilat3)

#remove extra PBL vars
ipe_bilat3 <- select(ipe_bilat3, -c(dyadid_PBL, posner_type_PBL,
                                    posner_PBL))
names(ipe_bilat3)

#cleanup name
ipe_bilat3 <- rename(ipe_bilat3,
                     "growth_par_WDI_PWT"= growth_WDI_PW_par,
                     "growth_rep_WDI_PWT" = growth_WDI_PW_rep)
names(ipe_bilat3)

# check <- ipe_bilat3 %>% 
#   filter(year < 1960) 

# --- Save
# Save an R version
save(ipe_bilat3, file = paste(preppeddata, "Bilateral_IPE_v3_02072023.RDATA", sep = ""))
saveRDS(ipe_bilat3, file = paste(preppeddata, "Bilateral_IPE_v3_02072023.RDS", sep = ""))

# Save a STATA-friendly version
library(haven)
write_dta(ipe_bilat3, paste(preppeddata, "Bilateral_IPE_v3.dta", sep = ""))
#write.table(ipe_bilat3, paste(preppeddata, "Bilateral_IPE_v3.tab", sep = ""))
write_csv(ipe_bilat3, file = paste(preppeddata,"Bilateral_IPE_v3_02072023.csv",sep=""))
